In [1]:
# Import necessary modules
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib.dates as mdates
import seaborn as sns 
import os
import sys
import time
import math
import re
import string
from bs4 import BeautifulSoup
import pickle
import joblib
from joblib import dump, load

import nltk as nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import spacy 
from spacy import displacy
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from pprint import pprint
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

from bertopic import BERTopic
import torch
import tensorflow as tf
import ipywidgets

from pandarallel import pandarallel
import multiprocessing
from multiprocessing import Pool

import warnings
warnings.filterwarnings(action = 'ignore', category = UserWarning, module = 'gensim')

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
%matplotlib inline
2023-05-26 23:53:26.858603: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-26 23:53:29.504505: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-05-26 23:53:29.504631: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-05-26 23:53:29.504644: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2023-05-26 23:53:31.644557: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.647956: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.651216: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.654392: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.658602: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.661352: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.664076: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.666743: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.669509: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.672257: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.674936: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-26 23:53:31.677610: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
In [2]:
num_processors = multiprocessing.cpu_count()
workers = num_processors-1
print(f'Available CPUs: {num_processors}')
Available CPUs: 96
In [3]:
pandarallel.initialize(progress_bar=True, nb_workers=num_processors-1, use_memory_fs=False)
INFO: Pandarallel will run on 95 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
In [4]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Out[4]:
True
In [5]:
%%time
df_yelp_sentiment = pd.read_parquet('df_yelp_sentiment.parquet', engine = 'pyarrow')
df_yelp_sentiment.shape
CPU times: user 4.79 s, sys: 3.04 s, total: 7.82 s
Wall time: 7.72 s
Out[5]:
(200284, 5)
In [6]:
df_yelp_sentiment.head()
Out[6]:
date clean_title clean_text sentiment sentiment_category
0 2021-03-18 Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online Ho... 1 Positive
1 2020-02-27 Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... 0 Negative
2 2021-03-26 Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... 0 Negative
3 2021-03-10 Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... 0 Negative
4 2020-10-20 Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... 0 Negative

Topic Modeling¶

Using BERTopic Modeling¶

In [7]:
df_topic_BERT = df_yelp_sentiment.copy()
In [8]:
df_topic_BERT.head()
Out[8]:
date clean_title clean_text sentiment sentiment_category
0 2021-03-18 Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online Ho... 1 Positive
1 2020-02-27 Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... 0 Negative
2 2021-03-26 Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... 0 Negative
3 2021-03-10 Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... 0 Negative
4 2020-10-20 Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... 0 Negative
In [9]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

# keep only letters, periods, and white space
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: re.sub(r'[^a-zA-Z.\s]', '',x))

# change to lower case
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: x.lower())

# remove stop words
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: ' '.join([word for word in x.split()if word not in (stopwords)]))
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
In [10]:
skip_list = ['Data Science', 'data science', 'DATA SCIENCE', 'AI', 'ai', 'artificial intelligence', 'Artificial Intelligence', 'ARTIFICIAL INTELLIGENCE', 'ML', 'NLP', 
             'Artificial General Intelligence', 'Chatbots', 'AI Marketplaces', 'Intelligent Applications', 'Augmented Intelligence', 'Decision Intelligence', 
             'AI Cloud Services', 'cloud services', 'GPU Accelerators', 'Computer Vision', 'Deep Neural Network', 'Deep Learning', 'Cognitive Computing', 
             'Autonomous Vehicles', 'Knowledge Graphs', 'Responsible AI', 'Machine Customers', 'Decision Intelligence', 'Autonomous Vehicles', 'Human-Centered AI', 
             'AI Governance', 'Natural Language Processing', 'Machine Learning', 'Smart Robots', 'Operational AI Systems', 'Data-Centric AI', 'AI TRiSM', 
             'Generative AI', 'Responsible AI']
In [11]:
stopwords = set(STOPWORDS)
stopwords.update(skip_list)
df_topic_BERT['clean_text'] = df_topic_BERT['clean_text'].parallel_apply(lambda x: ' '.join([word for word in x.split()if word not in (stopwords)]))
VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2109), Label(value='0 / 2109'))), …
In [12]:
df_topic_BERT.head()
Out[12]:
date clean_title clean_text sentiment sentiment_category
0 2021-03-18 Artificial intelligence improves parking efficiency in Chinese cities - People's Daily Online artificial intelligence improves parking efficiency chinese cities peoples daily online home chi... 1 Positive
1 2020-02-27 Children With Autism Saw Their Learning and Social Skills Boosted After Playing With This AI Rob... children autism saw learning social skills boosted playing robot news parliament skip content th... 0 Negative
2 2021-03-26 Forget ML, AI and Industry 4.0 – obsolescence should be your focus - 26 February 2021 - Test & R... forget ml industry . obsolescence focus february test rework solutions dataweek home us back iss... 0 Negative
3 2021-03-10 Strategy Analytics: 71% of Smartphones Sold Globally in 2021 will be AI Powered – Consumer Elect... strategy analytics smartphones sold globally powered consumer electronics net skip content consu... 0 Negative
4 2020-10-20 Olympus to Support Endoscopic AI Diagnosis Education for Doctors in India and to Launch AI Diagn... olympus support endoscopic diagnosis education doctors india launch diagnostic support applicati... 0 Negative
In [13]:
X = df_topic_BERT['clean_text'].values
In [14]:
%%time 
model = BERTopic(verbose=True, nr_topics= 8,  n_gram_range=(1, 3))
topics, probabilities = model.fit_transform(X)

# 1hr 42mins
Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]
Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]
Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]
Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]
Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]
Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]
Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]
Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]
Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]
Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]
Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]
Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]
Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]
Batches:   0%|          | 0/6259 [00:00<?, ?it/s]
2023-05-27 00:09:51,390 - BERTopic - Transformed documents to Embeddings
2023-05-27 00:13:37,859 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-05-27 00:14:02,305 - BERTopic - Clustered reduced embeddings
2023-05-27 00:58:34,837 - BERTopic - Reduced number of topics from 3939 to 8
CPU times: user 4h 58min 44s, sys: 1h 27s, total: 5h 59min 12s
Wall time: 1h 3s
In [16]:
# Save model as a file using pickle
with open('bert_model.pkl', 'wb') as f:
    pickle.dump(model, f)
In [17]:
# Load model from the saved file
with open('bert_model.pkl', 'rb') as f:
    model = pickle.load(f)
In [18]:
model.visualize_barchart()
In [19]:
model.visualize_topics()
In [20]:
model.get_topic_info()
Out[20]:
Topic Count Name
0 -1 49914 -1_market_news_us_data
1 0 150062 0_market_news_us_data
2 1 163 1_javascript_javascript disabled current_current browser configurationis_seeking alpha javascript
3 2 47 2_us_exports_export_analyzing satellite
4 3 40 3_enter valid_valid_enter_axios
5 4 30 4_shielding_shielding materials_emi_market
6 5 17 5_market_post covid update_covid update global_update global
7 6 11 6_center backup software_center backup_backup software_data center backup
In [21]:
topics_keywords = model.get_topics()
keywords_df_list = []
for i in range(len(topics_keywords)-1):
    topic_df = pd.DataFrame(topics_keywords[i])
    keywords_df_list.append(topic_df)
keywords_list = pd.concat(keywords_df_list)[0]
keywords_list_br = [keywords_list[i:i + 10] for i in range(0, len(keywords_list), 10)]
keywords_df = pd.DataFrame(keywords_list_br, index=['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6'])
In [56]:
# keywords_df.to_csv("keywords_BERT.csv", index=False)
keywords_df2 = pd.read_csv("keywords_BERT.csv")
keywords_df2
Out[56]:
0 1 2 3 4 5 6 7 8 9
0 market news us data new intelligence artificial artificial intelligence technology business
1 javascript javascript disabled current current browser configurationis seeking alpha javascript browser configurationis compatible configurationis compatible configurationis compatible site browser configurationis configurationis compatible site
2 us exports export analyzing satellite satellite images satellite new rules analyzing satellite images news
3 enter valid valid enter axios enter valid emailsubscription emailsubscription valid emailsubscription addressplease enter valid enter valid emailplease valid emailplease
4 shielding shielding materials emi market shielding materials technologies rfi shielding materials rfi shielding materials technologies emi rfi shielding emi rfi
5 market post covid update covid update global update global covid update post covid global covid update post
6 center backup software center backup backup software data center backup backup data center backup software market center market software
In [23]:
df_topic_BERT['Topic'] = topics
In [24]:
df_topic_BERT.to_csv("topics_BERT.csv", index = False)
In [25]:
df_topic_BERT = df_topic_BERT[df_topic_BERT["Topic"]!=-1]
In [26]:
df_topic_BERT.head()
Out[26]:
date clean_title clean_text sentiment sentiment_category Topic
6 2020-12-08 From the Bard to broadcaster: Stratford Festival builds new identity with streamer | National En... bard broadcaster stratford festival builds new identity streamer national entertainment penticto... 1 Positive 0
9 2020-06-14 Artificial Intelligence In Behavioral And Mental Health Care Market to Witness Astonishing Growt... artificial intelligence behavioral mental health care market witness astonishing growth focusing... 0 Negative 0
10 2020-07-10 AI/ Machine Learning Market 2020 Expected to Reach $XX Million by 2024 – IBM, BAIDU, SOUNDHOUND,... machine learning market expected reach xx million ibm baidu soundhound zebra medical vision pris... 1 Positive 0
11 2020-03-16 According to Latest Report on Machine Learning Courses Market to Grow with an Impressive CAGR: T... according latest report machine learning courses market grow impressive cagr top key players edx... 1 Positive 0
14 2023-04-06 Video Trump deepfakes on social media prompt warnings of AI risks - ABC News video trump deepfakes social media prompt warnings risks abc news abc newsvideoliveshowsguns ame... 1 Positive 0
In [27]:
# Group data by sentiment_category and Topic columns and get size of each group
grouped = df_topic_BERT.groupby(['sentiment_category', 'Topic']).size().reset_index(name='count')

# Pivot the table to have sentiment_category as columns, Topic as index, and count as values
pivot_table = grouped.pivot(index='Topic', columns='sentiment_category', values='count')

# Plot the pivot table as a stacked bar chart
pivot_table.plot(kind='bar', stacked=True)
plt.show()
In [28]:
df_topic_BERT['Topic_category'] = df_topic_BERT['Topic'].map({0: 'Technology insights & news', 1: 'Browser_script', 2:'Technology product & service', 3:'Shielding Materials', 
                                                  4:'Covid_related', 5:'classesntry', 6:'Text-related', 7:'map florida results'})
In [31]:
df_topic_BERT["Topic"].value_counts()
Out[31]:
Topic
0    150062
1       163
2        47
3        40
4        30
5        17
6        11
Name: count, dtype: int64

Articles that have a negative sentiment

In [32]:
df_neg_BERT = df_topic_BERT[df_topic_BERT['sentiment_category']=='Negative']
In [33]:
df_neg_BERT = df_neg_BERT[['clean_text']].drop_duplicates()
In [34]:
X_neg = df_neg_BERT['clean_text'].values
In [35]:
X_neg.shape
Out[35]:
(99313,)
In [36]:
%%time
model_neg_topics = BERTopic(verbose=True, nr_topics= 10,  n_gram_range=(1, 3))
neg_topics, neg_probabilities = model_neg_topics.fit_transform(X_neg)
Batches:   0%|          | 0/3104 [00:00<?, ?it/s]
2023-05-27 01:20:47,190 - BERTopic - Transformed documents to Embeddings
2023-05-27 01:22:12,397 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-05-27 01:22:23,924 - BERTopic - Clustered reduced embeddings
2023-05-27 01:45:35,303 - BERTopic - Reduced number of topics from 2594 to 10
CPU times: user 2h 28min 3s, sys: 18min 44s, total: 2h 46min 47s
Wall time: 30min 27s
In [37]:
model_neg_topics.visualize_barchart()
In [42]:
import matplotlib.pyplot as plt
import numpy as np

# Create a colormap
colors = plt.cm.get_cmap('hsv', len(topics['Topic'][:6]))

fig, ax = plt.subplots(figsize=(15, 10)) # Change figure size as per your requirement

# Width of the bars
width = 0.1 

# Iterate over each topic
for i, topic in enumerate(topics['Topic'][:6]):  
    # Get topic words and their probabilities
    topic_words = model_neg_topics.get_topic(topic)

    words = [word[0] for word in topic_words]
    probs = [word[1] for word in topic_words]

    # Generate bar positions
    bar_positions = np.arange(len(words)) + i * width

    # Create bar chart for this topic
    ax.bar(bar_positions, probs, width, label=f'Topic {topic}', color=colors(i))

# Add labels, title, and legend
ax.set_xlabel('Words')
ax.set_ylabel('Probabilities')
ax.set_title('Topic Modeling')
ax.set_xticks(bar_positions - width * (len(topics['Topic'][:6]) - 1) / 2)
ax.set_xticklabels(words)
plt.xticks(rotation=90)
ax.legend()

plt.show()
In [38]:
model_neg_topics.visualize_topics()
In [43]:
topics_keywords = model_neg_topics.get_topics()
keywords_df_list = []
for i in range(len(topics_keywords)-1):
    topic_df = pd.DataFrame(topics_keywords[i])
    keywords_df_list.append(topic_df)
keywords_list = pd.concat(keywords_df_list)[0]
keywords_list_br = [keywords_list[i:i + 10] for i in range(0, len(keywords_list), 10)]
keywords_df = pd.DataFrame(keywords_list_br, index=['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7', 'Topic_8'])
In [44]:
keywords_df
Out[44]:
0 1 2 3 4 5 6 7 8 9
Topic_0 market news us data new intelligence technology artificial business artificial intelligence
Topic_1 market report analysis global catheter growth research medical vascular urinary
Topic_2 data learning machine science data science deep learning developer deep course machine learning
Topic_3 american middle eastern eastern native native american african american asian hispanic eastern native american middle eastern native
Topic_4 maritime maritime risk maritime risk analysis risk analysis capability gsts risk enhance maritime enhance maritime risk solutions
Topic_5 maine maine public public maine public television public television maine public classical public classical classical podcast watch
Topic_6 likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuubavatarhttpsleonardoosnov... null chatgpt likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuuavatarhttpsleonardoosnova... likedataidtypetypestrcommentstatewithuserscontentidsubsiteidnameuuuuuuuavatarhttpsleonardoosnova...
Topic_7 post covid update market covid update global update global covid update post covid covid global update post
Topic_8 assistive elderly assistive disabled elderly assistive disabled elderly elderly disabled market assistive technology elderly assistive technology assistive technology market

Articles that have a positive sentiment

In [45]:
df_pos_BERT = df_topic_BERT[df_topic_BERT['sentiment_category']=='Positive']
In [46]:
df_pos_BERT = df_pos_BERT[['clean_text']].drop_duplicates()
In [47]:
X_pos = df_pos_BERT['clean_text'].values
In [48]:
X_pos.shape
Out[48]:
(49462,)
In [49]:
%%time
model_pos_topics = BERTopic(verbose=True, nr_topics= 10,  n_gram_range=(1, 3))
pos_topics, pos_probabilities = model_pos_topics.fit_transform(X_pos)
Batches:   0%|          | 0/1546 [00:00<?, ?it/s]
2023-05-27 02:07:15,861 - BERTopic - Transformed documents to Embeddings
2023-05-27 02:07:56,752 - BERTopic - Reduced dimensionality
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-05-27 02:08:03,546 - BERTopic - Clustered reduced embeddings
2023-05-27 02:20:28,615 - BERTopic - Reduced number of topics from 1289 to 10
CPU times: user 1h 8min 58s, sys: 7min 1s, total: 1h 15min 59s
Wall time: 16min 11s
In [50]:
model_pos_topics.visualize_barchart()
In [51]:
model_pos_topics.visualize_topics()
In [52]:
topics_keywords = model_pos_topics.get_topics()
keywords_df_list = []
for i in range(len(topics_keywords)-1):
    topic_df = pd.DataFrame(topics_keywords[i])
    keywords_df_list.append(topic_df)
keywords_list = pd.concat(keywords_df_list)[0]
keywords_list_br = [keywords_list[i:i + 10] for i in range(0, len(keywords_list), 10)]
keywords_df = pd.DataFrame(keywords_list_br, index=['Topic_0', 'Topic_1', 'Topic_2', 'Topic_3', 'Topic_4', 'Topic_5', 'Topic_6', 'Topic_7', 'Topic_8'])
In [53]:
keywords_df
Out[53]:
0 1 2 3 4 5 6 7 8 9
Topic_0 market intelligence news artificial us artificial intelligence data global report new
Topic_1 market report global analysis medical research growth catheters key vascular
Topic_2 gpu nvidia performance rtx modules mxm gpu gpu modules mxm dihuni mxm gpu modules
Topic_3 camera camera market market fever detector thermal detector thermal camera thermal camera pixels fever detector thermal fever detector
Topic_4 gartner enter valid testing validation valid cool independent testing validation independent testing calypsoai validation testing
Topic_5 market hemostat disposable hemostat disposable global report brachytherapy analysis growth seeds market
Topic_6 javascript disabledplease disabledplease enable javascript currently disabledplease currently disabledplease currently disabledplease enable jumi disabledplease enable better experience jumi better experience jumi
Topic_7 thrive global thrive global use people intelligence businesses work facebookfollow thrive facebookfollow thrive global
Topic_8 shielding shielding materials emi shielding materials technologies rfi shielding materials rfi shielding materials technologies emi rfi emi rfi shielding rfi
In [54]:
# group the DataFrame by sentiment and topic category, count the occurrences of clean_text, and reset the index
temp = df_topic_BERT.groupby(['sentiment_category', 'Topic_category'])['clean_text'].count().reset_index()

# rename the count column to reflect the sentiment category and set the index to topic category
temp = temp.rename(columns={'clean_text': 'count'}).set_index('Topic_category')

# filter the DataFrame by sentiment category and rename the count column
temp_neg = temp[temp['sentiment_category'] == 'Negative'].rename(columns={'count': 'count_of_negative_articles'})
temp_pos = temp[temp['sentiment_category'] == 'Positive'].rename(columns={'count': 'count_of_positive_articles'})

# concatenate the two DataFrames and select only the relevant columns
topics_by_sentiment = pd.concat([temp_neg, temp_pos], axis=1)[['count_of_negative_articles', 'count_of_positive_articles']]

# sort the DataFrame by the count of positive articles in descending order
topics_by_sentiment = topics_by_sentiment.sort_values(by='count_of_positive_articles', ascending=False)
In [55]:
# Dataframe showing number of positive and negative articles respectively per topic
topics_by_sentiment
Out[55]:
count_of_negative_articles count_of_positive_articles
Topic_category
Technology insights & news 100253 49809
Browser_script 121 42
Covid_related 13 17
Technology product & service 30 17
Shielding Materials 26 14
Text-related 5 6
classesntry 14 3

Analyzing certain companies

1) ChatGPT

In [57]:
# define the keywords to search for
keywords = ['chatGPT','chatgpt']

# join the keywords using the OR operator '|'
query = '|'.join(keywords)

# filter the DataFrame by the query string
tp_chatGPT = df_topic_BERT[df_topic_BERT['clean_title'].str.contains(query)]

# count the occurrences of sentiment category and transpose the DataFrame
cnt_chatGPT = pd.DataFrame(tp_chatGPT['sentiment_category'].value_counts()).rename(columns={'sentiment_category': 'chatGPT'}).T

# calculate the total number of articles and add it as a new column
cnt_chatGPT['total'] = cnt_chatGPT['Negative'] + cnt_chatGPT['Positive']

# calculate the percentage of negative and positive articles and round to two decimal places
cnt_chatGPT['Negative%'] = np.round(100 * cnt_chatGPT['Negative'] / cnt_chatGPT['total'], 2)
cnt_chatGPT['Positive%'] = np.round(100 * cnt_chatGPT['Positive'] / cnt_chatGPT['total'], 2)
In [58]:
cnt_chatGPT
Out[58]:
sentiment_category Negative Positive total Negative% Positive%
count 27 23 50 54.0 46.0

2) Google

In [59]:
# define the keywords to search for
keywords = ['google', 'Google']

# join the keywords using the OR operator '|'
query = '|'.join(keywords)

# filter the DataFrame by the query string
tp_google = df_topic_BERT[df_topic_BERT['clean_title'].str.contains(query)]

# count the occurrences of sentiment category and transpose the DataFrame
cnt_google = pd.DataFrame(tp_google['sentiment_category'].value_counts()).rename(columns={'sentiment_category': 'Google'}).T

# calculate the total number of articles and add it as a new column
cnt_google['total'] = cnt_google['Negative'] + cnt_google['Positive']

# calculate the percentage of negative and positive articles and round to two decimal places
cnt_google['Negative%'] = np.round(100 * cnt_google['Negative'] / cnt_google['total'], 2)
cnt_google['Positive%'] = np.round(100 * cnt_google['Positive'] / cnt_google['total'], 2)
In [60]:
cnt_google
Out[60]:
sentiment_category Negative Positive total Negative% Positive%
count 5981 2508 8489 70.46 29.54

3) Microsoft

In [61]:
# define the keywords to search for
keywords = ['microsoft', 'Microsoft', 'MS', 'ms']

# join the keywords using the OR operator '|'
query = '|'.join(keywords)

# filter the DataFrame by the query string
tp_microsoft = df_topic_BERT[df_topic_BERT['clean_title'].str.contains(query)]

# count the occurrences of sentiment category and transpose the DataFrame
cnt_microsoft = pd.DataFrame(tp_microsoft['sentiment_category'].value_counts()).rename(columns={'sentiment_category': 'Microsoft'}).T

# calculate the total number of articles and add it as a new column
cnt_microsoft['total'] = cnt_microsoft['Negative'] + cnt_microsoft['Positive']

# calculate the percentage of negative and positive articles and round to two decimal places
cnt_microsoft['Negative%'] = np.round(100 * cnt_microsoft['Negative'] / cnt_microsoft['total'], 2)
cnt_microsoft['Positive%'] = np.round(100 * cnt_microsoft['Positive'] / cnt_microsoft['total'], 2)
In [62]:
cnt_microsoft
Out[62]:
sentiment_category Negative Positive total Negative% Positive%
count 10401 5555 15956 65.19 34.81
In [ ]: